# Setup file locations to import
beer_csv_loc <- "./StartingDBs/Beers.csv"
breweries_csv_loc <- "./StartingDBs/Breweries.csv"
# Read in data
beer_data <- read.csv(beer_csv_loc, header = TRUE)
brewery_data <- read.csv(breweries_csv_loc, header = TRUE)
# Data provided by Celia
# Celia's contributions prefied by cb_
cb_beer <- read.xlsx(file="./StartingDBs/Beers_Celia.xlsx", sheetIndex = 1,
header=TRUE, stringsAsFactors = FALSE)
cb_brewery <- read.xlsx(file="./StartingDBs/Breweries_Celia.xlsx", sheetIndex = 1,
header=TRUE, stringsAsFactors = FALSE)
# Create a seperate data frame to store state data
### Note: This data set counts the district of columbia as a state
num_of_breweries_by_state <- data.frame(table(brewery_data$State))
##### Celia Banks #####
#Begin prepare an interactive geo map of breweries by state
#set the map
leaflet() %>% addTiles()
#add popup for brewery information
brewery_map <- cb_brewery %>% mutate(popup_info=paste(Name,"<br/>",City,"<br>",State,"<br>"))
#tweak the map with info layout, set circles smaller to enhance visual
leaflet() %>% addTiles() %>% addCircleMarkers(data=brewery_map,
lat=~Latitude,
lng=~Longitude,
radius=~1,
popup=~popup_info)
#End prepare an interactive geo map of breweries by state#
#corresponding distribution plot of breweries by state
ggplot(brewery_map, aes(x=as.factor(State), color="blue")) +
geom_bar(color="blue") +
ggtitle("Distribution of Breweries by State") +
xlab("State") +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-.25) +
theme(legend.position="none", axis.text.x = element_text(angle = 90)) +
theme(plot.title = element_text(hjust = 0.5))
#corresponding distribution plot of breweries by region
ggplot(brewery_map, aes(x=as.factor(Region), color="green")) +
geom_bar(color="green") +
ggtitle("Distribution of Breweries by Region") +
xlab("Region") +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-.25) +
theme(legend.position="none", axis.text.x = element_text(angle = 90)) +
theme(plot.title = element_text(hjust = 0.5))
##### END CB #####
#the last six observations to check the merged file
# Change column name of beer_data to match brewery_data to use as a primary key
colnames(beer_data)[5] <- "Brew_ID"
# Merge two data bases using merge
full_brew_data <- merge(brewery_data, beer_data, by="Brew_ID")
# Rename columns 2 and 5 that were changed during the merge
colnames(full_brew_data)[2] <- "Brewery"
colnames(full_brew_data)[5] <- "Beer_Name"
# Print first 6 and last 6 observations
head(full_brew_data, n=6)
## Brew_ID Brewery City State Beer_Name Beer_ID ABV IBU
## 1 1 NorthGate Brewing Minneapolis MN Pumpion 2689 0.060 38
## 2 1 NorthGate Brewing Minneapolis MN Stronghold 2688 0.060 25
## 3 1 NorthGate Brewing Minneapolis MN Parapet ESB 2687 0.056 47
## 4 1 NorthGate Brewing Minneapolis MN Get Together 2692 0.045 50
## 5 1 NorthGate Brewing Minneapolis MN Maggie's Leap 2691 0.049 26
## 6 1 NorthGate Brewing Minneapolis MN Wall's End 2690 0.048 19
## Style Ounces
## 1 Pumpkin Ale 16
## 2 American Porter 16
## 3 Extra Special / Strong Bitter (ESB) 16
## 4 American IPA 16
## 5 Milk / Sweet Stout 16
## 6 English Brown Ale 16
tail(full_brew_data, n=6)
## Brew_ID Brewery City State
## 2405 556 Ukiah Brewing Company Ukiah CA
## 2406 557 Butternuts Beer and Ale Garrattsville NY
## 2407 557 Butternuts Beer and Ale Garrattsville NY
## 2408 557 Butternuts Beer and Ale Garrattsville NY
## 2409 557 Butternuts Beer and Ale Garrattsville NY
## 2410 558 Sleeping Lady Brewing Company Anchorage AK
## Beer_Name Beer_ID ABV IBU Style Ounces
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener 12
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12
## 2407 Snapperhead IPA 51 0.068 NA American IPA 12
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12
# Create CSV Files for head and tail
head_output_csv <- "./output/head_beer_data.csv"
tail_output_csv <- "./output/tail_been_data.csv"
write.csv(head(full_brew_data, n=6), head_output_csv)
write.csv(tail(full_brew_data, n=6), tail_output_csv)
# Create two new tables where one has only ABV with no <NA>'s and the other
state_abv <- data.frame(State=full_brew_data$State, ABV=full_brew_data$ABV)
state_abv <- state_abv %>% drop_na(ABV)
# Do the same for IBU
state_ibu <- data.frame(State=full_brew_data$State, IBU=full_brew_data$IBU)
state_ibu <- state_ibu %>% drop_na(IBU)
##state. Plot a bar chart
# Find the average ABV for the state_abv table
avg_state_abv <- ddply(state_abv, .(State), function(x) median(x$ABV))
colnames(avg_state_abv)[2] <- "Average_ABV" # Rename the column
# Drop DC. It's not a state....yet
avg_state_abv <- avg_state_abv[-c(8),] # DC is equal to position 8 on the table
# Order the states by Decending ABV content
avg_state_abv <- avg_state_abv[order(avg_state_abv$Average_ABV, decreasing=TRUE),]
# Find Average IBU
avg_state_ibu <- ddply(state_ibu, .(State), function(x) median(x$IBU))
colnames(avg_state_ibu)[2] <- "Average_IBU" # Rename column
# Order the states by IBU Decending
avg_state_ibu <- avg_state_ibu[order(avg_state_ibu$Average_IBU, decreasing=TRUE),]
# Plot Average Alcohol by Volume per state
ggplot(avg_state_abv, aes(x=reorder(State, desc(Average_ABV)), y=Average_ABV, fill=State)) +
geom_col(show.legend = FALSE, width=.9, position="dodge") +
ggtitle("Average ABV per State") +
xlab("State") +
ylab("Average Alcohol by Volume")
# Plot Average International Bitterness Unit per State
ggplot(avg_state_ibu, aes(x=reorder(State, desc(Average_IBU)), y=Average_IBU, fill=State)) +
geom_col(show.legend = FALSE, width=.9, position="dodge") +
ggtitle("Average IBU per State") +
xlab("State") +
ylab("Average IBU")
##### Celia Code #####
merge_bb <- merge(cb_beer, cb_brewery, by.x="Brewery_id", by.y="Brew_ID")
#Drop latitude and longitude columns as no longer needed
drop <- c("Latitude","Longitude")
cb_merged_bb <- merge_bb[,!(names(merge_bb) %in% drop)]
#rename columns to avoid confusion
#cb_merged_bb <- cb_merged_bb %>% rename(Beer_Name = Name.x, Brewery_Name = Name.y)
### RAH ###
colnames(cb_merged_bb)[2] <- "Beer_Name"
colnames(cb_merged_bb)[8] <- "Brewery_Name"
### End RAH ###
#get count of breweries in each state
brewery_count <- aggregate(Brewery_id ~ State, cb_merged_bb, sum)
aggregate(cb_merged_bb$Brewery_id, by=list(Category=cb_merged_bb$State), FUN=sum)
## Category x
## 1 AK 8363
## 2 AL 4208
## 3 AR 820
## 4 AZ 9815
## 5 CA 46773
## 6 CO 65472
## 7 CT 9222
## 8 DC 1824
## 9 DE 857
## 10 FL 15781
## 11 GA 4662
## 12 HI 8521
## 13 IA 11756
## 14 ID 8605
## 15 IL 15231
## 16 IN 12855
## 17 KS 2899
## 18 KY 1334
## 19 LA 4817
## 20 MA 21195
## 21 MD 4789
## 22 ME 8142
## 23 MI 15130
## 24 MN 8687
## 25 MO 10063
## 26 MS 1698
## 27 MT 16604
## 28 NC 20439
## 29 ND 1008
## 30 NE 8855
## 31 NH 1508
## 32 NJ 1892
## 33 NM 5103
## 34 NV 4653
## 35 NY 19400
## 36 OH 7974
## 37 OK 5483
## 38 OR 26890
## 39 PA 26078
## 40 RI 5802
## 41 SC 3108
## 42 SD 1491
## 43 TN 2073
## 44 TX 25427
## 45 UT 7289
## 46 VA 11011
## 47 VT 6459
## 48 WA 25471
## 49 WI 18638
## 50 WV 314
## 51 WY 4438
#impute the rows with zeroes
cb_merged_bb$ABV[is.na(cb_merged_bb$ABV)] <- 0
cb_merged_bb$IBU[is.na(cb_merged_bb$IBU)] <- 0
#compute median ABV and IBU for each state
cb_medianABV <- aggregate(ABV ~ State + Region, cb_merged_bb, median)
cb_medianIBU <- aggregate(IBU ~ State + Region, cb_merged_bb, median)
#merge the medians dataframes
cb_mergeABVIBU <- merge(cb_medianABV, cb_medianIBU, by="State")
drop <- c("Region.y")
cb_mergeABVIBU = merge_bb[,!(names(merge_bb) %in% drop)]
#Visualize ABV v IBU medians by state and by region
options(scipen = 999)
cb_mergeABVIBU <- as.data.frame(cb_mergeABVIBU)
#by state
ggplot(cb_mergeABVIBU, aes(ABV, IBU, color=State)) +
geom_point(show.legend = FALSE) +
facet_wrap(vars(State))
## Warning: Removed 1005 rows containing missing values (geom_point).
#by region
ggplot(cb_mergeABVIBU, aes(ABV, IBU, color=Region)) +
geom_point() +
facet_wrap(vars(Region))
## Warning: Removed 1005 rows containing missing values (geom_point).
##### END #####
# Get the State with the highest average ABV
state_max_abv <- avg_state_abv[which.max(avg_state_abv$Average_ABV),] # NV
# Get the State with the highest average IBU
state_max_ibu <- avg_state_ibu[which.max(avg_state_ibu$Average_IBU),] # WV
#and other types of Ale. Use KNN.
6. Comment on the Summary stats and distribution of the ABV variable
##7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content?